{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 4.2 文本切块"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Document(page_content='你好你好#你好你好'), Document(page_content='你好你好')]\n"
     ]
    }
   ],
   "source": [
    "text = \"你好你好#你好你好#你好你好\" # your text\n",
    "from langchain.text_splitter import CharacterTextSplitter\n",
    "text_splitter = CharacterTextSplitter(\n",
    "    separator = \"#\",\n",
    "    chunk_size = 9,\n",
    "    chunk_overlap  = 0\n",
    ")\n",
    "docs = text_splitter.create_documents([text])\n",
    "print(docs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Document(page_content='枣。瓜'), Document(page_content='。香蕉'), Document(page_content='，菠萝'), Document(page_content='蜜'), Document(page_content='。梨')]\n"
     ]
    }
   ],
   "source": [
    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
    "text_splitter = RecursiveCharacterTextSplitter(\n",
    "    # Set a really small chunk size, just to show.\n",
    "    chunk_size = 3,\n",
    "    chunk_overlap  = 0,\n",
    "    length_function = len,\n",
    "    separators=[\"。\",\"，\",\"\"]\n",
    ")\n",
    "text =\"枣。瓜。香蕉，菠萝蜜。梨\"\n",
    "texts = text_splitter.create_documents([text])\n",
    "print(texts)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['表示两个切分文本之间的重合度，文本块之间的最大重叠量，保留一些重叠可以保持文本块之间的连续性，可以使用滑动窗口进行构造，这个比较重要。表示两个切分文本之间的重合度，文本块之间的最大重叠量，保留一些重叠可以保持文本块之间的连续性，可以使用滑动窗口进行构造，这个比较重要。']\n"
     ]
    }
   ],
   "source": [
    "text = \"表示两个切分文本之间的重合度，文本块之间的最大重叠量，保留一些重叠可以保持文本块之间的连续性，可以使用滑动窗口进行构造，这个比较重要。\"*2 # your text\n",
    "from langchain.text_splitter import NLTKTextSplitter\n",
    "text_splitter = NLTKTextSplitter(chunk_size=2,chunk_overlap=0)\n",
    "docs = text_splitter.split_text(text)\n",
    "print(docs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "text = \"...\" # your text\n",
    "from langchain.text_splitter import SpacyTextSplitter\n",
    "text_splitter = SpacyTextSplitter()\n",
    "docs = text_splitter.split_text(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from modelscope.outputs import OutputKeys\n",
    "from modelscope.pipelines import pipeline\n",
    "from modelscope.utils.constant import Tasks\n",
    "\n",
    "p = pipeline(\n",
    "    task=Tasks.document_segmentation,\n",
    "    model='damo/nlp_bert_document-segmentation_chinese-base')\n",
    "\n",
    "result = p(documents='......')\n",
    "print(result[OutputKeys.TEXT])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 4.3 向量数据库"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(Document(page_content='FAISS is an important library'), 0.38126546), (Document(page_content='LangChain supports FAISS'), 0.5099826)]\n"
     ]
    }
   ],
   "source": [
    "from langchain.embeddings.openai import OpenAIEmbeddings\n",
    "from langchain.vectorstores import FAISS\n",
    "import os\n",
    "os.environ[\"openai_api_base\"] = \"xxx\"\n",
    "os.environ[\"openai_api_key\"] = \"xxx\"\n",
    "embeddings = OpenAIEmbeddings()\n",
    "texts = [\"FAISS is an important library\", \"LangChain supports FAISS\"]\n",
    "faiss = FAISS.from_texts(texts, embeddings)\n",
    "docs_and_scores = faiss.similarity_search_with_score(\"library\",k=2)\n",
    "print(docs_and_scores)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
